{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "正在爬取第 1 页...\n",
      "正在爬取第 2 页...\n",
      "正在爬取第 3 页...\n",
      "正在爬取第 4 页...\n",
      "正在爬取第 5 页...\n"
     ]
    },
    {
     "ename": "SSLError",
     "evalue": "HTTPSConnectionPool(host='p1.meituan.net', port=443): Max retries exceeded with url: /movie/4c55f3bf5fa9660db3cb7014651a0950267034.jpg@464w_644h_1e_1c (Caused by SSLError(SSLEOFError(8, '[SSL: UNEXPECTED_EOF_WHILE_READING] EOF occurred in violation of protocol (_ssl.c:1006)')))",
     "output_type": "error",
     "traceback": [
      "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
      "\u001b[1;31mSSLError\u001b[0m                                  Traceback (most recent call last)",
      "\u001b[1;31mSSLError\u001b[0m: [SSL: UNEXPECTED_EOF_WHILE_READING] EOF occurred in violation of protocol (_ssl.c:1006)",
      "\nThe above exception was the direct cause of the following exception:\n",
      "\u001b[1;31mMaxRetryError\u001b[0m                             Traceback (most recent call last)",
      "File \u001b[1;32md:\\conda\\envs\\MyMLenvAllinOne\\Lib\\site-packages\\requests\\adapters.py:486\u001b[0m, in \u001b[0;36mHTTPAdapter.send\u001b[1;34m(self, request, stream, timeout, verify, cert, proxies)\u001b[0m\n\u001b[0;32m    485\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[1;32m--> 486\u001b[0m     resp \u001b[38;5;241m=\u001b[39m conn\u001b[38;5;241m.\u001b[39murlopen(\n\u001b[0;32m    487\u001b[0m         method\u001b[38;5;241m=\u001b[39mrequest\u001b[38;5;241m.\u001b[39mmethod,\n\u001b[0;32m    488\u001b[0m         url\u001b[38;5;241m=\u001b[39murl,\n\u001b[0;32m    489\u001b[0m         body\u001b[38;5;241m=\u001b[39mrequest\u001b[38;5;241m.\u001b[39mbody,\n\u001b[0;32m    490\u001b[0m         headers\u001b[38;5;241m=\u001b[39mrequest\u001b[38;5;241m.\u001b[39mheaders,\n\u001b[0;32m    491\u001b[0m         redirect\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mFalse\u001b[39;00m,\n\u001b[0;32m    492\u001b[0m         assert_same_host\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mFalse\u001b[39;00m,\n\u001b[0;32m    493\u001b[0m         preload_content\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mFalse\u001b[39;00m,\n\u001b[0;32m    494\u001b[0m         decode_content\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mFalse\u001b[39;00m,\n\u001b[0;32m    495\u001b[0m         retries\u001b[38;5;241m=\u001b[39m\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mmax_retries,\n\u001b[0;32m    496\u001b[0m         timeout\u001b[38;5;241m=\u001b[39mtimeout,\n\u001b[0;32m    497\u001b[0m         chunked\u001b[38;5;241m=\u001b[39mchunked,\n\u001b[0;32m    498\u001b[0m     )\n\u001b[0;32m    500\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m (ProtocolError, \u001b[38;5;167;01mOSError\u001b[39;00m) \u001b[38;5;28;01mas\u001b[39;00m err:\n",
      "File \u001b[1;32md:\\conda\\envs\\MyMLenvAllinOne\\Lib\\site-packages\\urllib3\\connectionpool.py:844\u001b[0m, in \u001b[0;36mHTTPConnectionPool.urlopen\u001b[1;34m(self, method, url, body, headers, retries, redirect, assert_same_host, timeout, pool_timeout, release_conn, chunked, body_pos, preload_content, decode_content, **response_kw)\u001b[0m\n\u001b[0;32m    842\u001b[0m     new_e \u001b[38;5;241m=\u001b[39m ProtocolError(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mConnection aborted.\u001b[39m\u001b[38;5;124m\"\u001b[39m, new_e)\n\u001b[1;32m--> 844\u001b[0m retries \u001b[38;5;241m=\u001b[39m retries\u001b[38;5;241m.\u001b[39mincrement(\n\u001b[0;32m    845\u001b[0m     method, url, error\u001b[38;5;241m=\u001b[39mnew_e, _pool\u001b[38;5;241m=\u001b[39m\u001b[38;5;28mself\u001b[39m, _stacktrace\u001b[38;5;241m=\u001b[39msys\u001b[38;5;241m.\u001b[39mexc_info()[\u001b[38;5;241m2\u001b[39m]\n\u001b[0;32m    846\u001b[0m )\n\u001b[0;32m    847\u001b[0m retries\u001b[38;5;241m.\u001b[39msleep()\n",
      "File \u001b[1;32md:\\conda\\envs\\MyMLenvAllinOne\\Lib\\site-packages\\urllib3\\util\\retry.py:515\u001b[0m, in \u001b[0;36mRetry.increment\u001b[1;34m(self, method, url, response, error, _pool, _stacktrace)\u001b[0m\n\u001b[0;32m    514\u001b[0m     reason \u001b[38;5;241m=\u001b[39m error \u001b[38;5;129;01mor\u001b[39;00m ResponseError(cause)\n\u001b[1;32m--> 515\u001b[0m     \u001b[38;5;28;01mraise\u001b[39;00m MaxRetryError(_pool, url, reason) \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mreason\u001b[39;00m  \u001b[38;5;66;03m# type: ignore[arg-type]\u001b[39;00m\n\u001b[0;32m    517\u001b[0m log\u001b[38;5;241m.\u001b[39mdebug(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mIncremented Retry for (url=\u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;132;01m%s\u001b[39;00m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124m): \u001b[39m\u001b[38;5;132;01m%r\u001b[39;00m\u001b[38;5;124m\"\u001b[39m, url, new_retry)\n",
      "\u001b[1;31mMaxRetryError\u001b[0m: HTTPSConnectionPool(host='p1.meituan.net', port=443): Max retries exceeded with url: /movie/4c55f3bf5fa9660db3cb7014651a0950267034.jpg@464w_644h_1e_1c (Caused by SSLError(SSLEOFError(8, '[SSL: UNEXPECTED_EOF_WHILE_READING] EOF occurred in violation of protocol (_ssl.c:1006)')))",
      "\nDuring handling of the above exception, another exception occurred:\n",
      "\u001b[1;31mSSLError\u001b[0m                                  Traceback (most recent call last)",
      "Cell \u001b[1;32mIn[4], line 56\u001b[0m\n\u001b[0;32m     54\u001b[0m base_url \u001b[38;5;241m=\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mhttps://ssr1.scrape.center/page\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[0;32m     55\u001b[0m num_pages \u001b[38;5;241m=\u001b[39m \u001b[38;5;241m10\u001b[39m  \u001b[38;5;66;03m# 假设总共有10页电影网页\u001b[39;00m\n\u001b[1;32m---> 56\u001b[0m scrape_all_pages(base_url, num_pages)\n",
      "Cell \u001b[1;32mIn[4], line 50\u001b[0m, in \u001b[0;36mscrape_all_pages\u001b[1;34m(base_url, num_pages)\u001b[0m\n\u001b[0;32m     48\u001b[0m     url \u001b[38;5;241m=\u001b[39m \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mbase_url\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m/\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mi\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m\"\u001b[39m\n\u001b[0;32m     49\u001b[0m     \u001b[38;5;28mprint\u001b[39m(\u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m正在爬取第 \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mi\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m 页...\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[1;32m---> 50\u001b[0m     scrape_movie_page(url)\n\u001b[0;32m     51\u001b[0m \u001b[38;5;28mprint\u001b[39m(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m所有网页已爬取完成！\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n",
      "Cell \u001b[1;32mIn[4], line 41\u001b[0m, in \u001b[0;36mscrape_movie_page\u001b[1;34m(url)\u001b[0m\n\u001b[0;32m     39\u001b[0m         img_path \u001b[38;5;241m=\u001b[39m os\u001b[38;5;241m.\u001b[39mpath\u001b[38;5;241m.\u001b[39mjoin(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mD:/desktop/movie_images\u001b[39m\u001b[38;5;124m\"\u001b[39m, img_name)\n\u001b[0;32m     40\u001b[0m         \u001b[38;5;28;01mwith\u001b[39;00m \u001b[38;5;28mopen\u001b[39m(img_path, \u001b[38;5;124m'\u001b[39m\u001b[38;5;124mwb\u001b[39m\u001b[38;5;124m'\u001b[39m) \u001b[38;5;28;01mas\u001b[39;00m img_file:\n\u001b[1;32m---> 41\u001b[0m             img_file\u001b[38;5;241m.\u001b[39mwrite(requests\u001b[38;5;241m.\u001b[39mget(img_url)\u001b[38;5;241m.\u001b[39mcontent)\n\u001b[0;32m     42\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m     43\u001b[0m     \u001b[38;5;28mprint\u001b[39m(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m请求失败，状态码:\u001b[39m\u001b[38;5;124m\"\u001b[39m, response\u001b[38;5;241m.\u001b[39mstatus_code)\n",
      "File \u001b[1;32md:\\conda\\envs\\MyMLenvAllinOne\\Lib\\site-packages\\requests\\api.py:73\u001b[0m, in \u001b[0;36mget\u001b[1;34m(url, params, **kwargs)\u001b[0m\n\u001b[0;32m     62\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mget\u001b[39m(url, params\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mNone\u001b[39;00m, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs):\n\u001b[0;32m     63\u001b[0m \u001b[38;5;250m    \u001b[39m\u001b[38;5;124mr\u001b[39m\u001b[38;5;124;03m\"\"\"Sends a GET request.\u001b[39;00m\n\u001b[0;32m     64\u001b[0m \n\u001b[0;32m     65\u001b[0m \u001b[38;5;124;03m    :param url: URL for the new :class:`Request` object.\u001b[39;00m\n\u001b[1;32m   (...)\u001b[0m\n\u001b[0;32m     70\u001b[0m \u001b[38;5;124;03m    :rtype: requests.Response\u001b[39;00m\n\u001b[0;32m     71\u001b[0m \u001b[38;5;124;03m    \"\"\"\u001b[39;00m\n\u001b[1;32m---> 73\u001b[0m     \u001b[38;5;28;01mreturn\u001b[39;00m request(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mget\u001b[39m\u001b[38;5;124m\"\u001b[39m, url, params\u001b[38;5;241m=\u001b[39mparams, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs)\n",
      "File \u001b[1;32md:\\conda\\envs\\MyMLenvAllinOne\\Lib\\site-packages\\requests\\api.py:59\u001b[0m, in \u001b[0;36mrequest\u001b[1;34m(method, url, **kwargs)\u001b[0m\n\u001b[0;32m     55\u001b[0m \u001b[38;5;66;03m# By using the 'with' statement we are sure the session is closed, thus we\u001b[39;00m\n\u001b[0;32m     56\u001b[0m \u001b[38;5;66;03m# avoid leaving sockets open which can trigger a ResourceWarning in some\u001b[39;00m\n\u001b[0;32m     57\u001b[0m \u001b[38;5;66;03m# cases, and look like a memory leak in others.\u001b[39;00m\n\u001b[0;32m     58\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m sessions\u001b[38;5;241m.\u001b[39mSession() \u001b[38;5;28;01mas\u001b[39;00m session:\n\u001b[1;32m---> 59\u001b[0m     \u001b[38;5;28;01mreturn\u001b[39;00m session\u001b[38;5;241m.\u001b[39mrequest(method\u001b[38;5;241m=\u001b[39mmethod, url\u001b[38;5;241m=\u001b[39murl, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs)\n",
      "File \u001b[1;32md:\\conda\\envs\\MyMLenvAllinOne\\Lib\\site-packages\\requests\\sessions.py:589\u001b[0m, in \u001b[0;36mSession.request\u001b[1;34m(self, method, url, params, data, headers, cookies, files, auth, timeout, allow_redirects, proxies, hooks, stream, verify, cert, json)\u001b[0m\n\u001b[0;32m    584\u001b[0m send_kwargs \u001b[38;5;241m=\u001b[39m {\n\u001b[0;32m    585\u001b[0m     \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mtimeout\u001b[39m\u001b[38;5;124m\"\u001b[39m: timeout,\n\u001b[0;32m    586\u001b[0m     \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mallow_redirects\u001b[39m\u001b[38;5;124m\"\u001b[39m: allow_redirects,\n\u001b[0;32m    587\u001b[0m }\n\u001b[0;32m    588\u001b[0m send_kwargs\u001b[38;5;241m.\u001b[39mupdate(settings)\n\u001b[1;32m--> 589\u001b[0m resp \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39msend(prep, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39msend_kwargs)\n\u001b[0;32m    591\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m resp\n",
      "File \u001b[1;32md:\\conda\\envs\\MyMLenvAllinOne\\Lib\\site-packages\\requests\\sessions.py:703\u001b[0m, in \u001b[0;36mSession.send\u001b[1;34m(self, request, **kwargs)\u001b[0m\n\u001b[0;32m    700\u001b[0m start \u001b[38;5;241m=\u001b[39m preferred_clock()\n\u001b[0;32m    702\u001b[0m \u001b[38;5;66;03m# Send the request\u001b[39;00m\n\u001b[1;32m--> 703\u001b[0m r \u001b[38;5;241m=\u001b[39m adapter\u001b[38;5;241m.\u001b[39msend(request, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs)\n\u001b[0;32m    705\u001b[0m \u001b[38;5;66;03m# Total elapsed time of the request (approximately)\u001b[39;00m\n\u001b[0;32m    706\u001b[0m elapsed \u001b[38;5;241m=\u001b[39m preferred_clock() \u001b[38;5;241m-\u001b[39m start\n",
      "File \u001b[1;32md:\\conda\\envs\\MyMLenvAllinOne\\Lib\\site-packages\\requests\\adapters.py:517\u001b[0m, in \u001b[0;36mHTTPAdapter.send\u001b[1;34m(self, request, stream, timeout, verify, cert, proxies)\u001b[0m\n\u001b[0;32m    513\u001b[0m         \u001b[38;5;28;01mraise\u001b[39;00m ProxyError(e, request\u001b[38;5;241m=\u001b[39mrequest)\n\u001b[0;32m    515\u001b[0m     \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(e\u001b[38;5;241m.\u001b[39mreason, _SSLError):\n\u001b[0;32m    516\u001b[0m         \u001b[38;5;66;03m# This branch is for urllib3 v1.22 and later.\u001b[39;00m\n\u001b[1;32m--> 517\u001b[0m         \u001b[38;5;28;01mraise\u001b[39;00m SSLError(e, request\u001b[38;5;241m=\u001b[39mrequest)\n\u001b[0;32m    519\u001b[0m     \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mConnectionError\u001b[39;00m(e, request\u001b[38;5;241m=\u001b[39mrequest)\n\u001b[0;32m    521\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m ClosedPoolError \u001b[38;5;28;01mas\u001b[39;00m e:\n",
      "\u001b[1;31mSSLError\u001b[0m: HTTPSConnectionPool(host='p1.meituan.net', port=443): Max retries exceeded with url: /movie/4c55f3bf5fa9660db3cb7014651a0950267034.jpg@464w_644h_1e_1c (Caused by SSLError(SSLEOFError(8, '[SSL: UNEXPECTED_EOF_WHILE_READING] EOF occurred in violation of protocol (_ssl.c:1006)')))"
     ]
    }
   ],
   "source": [
    "import requests\n",
    "from bs4 import BeautifulSoup\n",
    "import csv\n",
    "import codecs\n",
    "import os\n",
    "\n",
    "# 创建保存图片的文件夹\n",
    "if not os.path.exists(\"D:/desktop/movie_images\"):\n",
    "    os.makedirs(\"D:/desktop/movie_images\")\n",
    "\n",
    "# 添加 BOM 头以避免在 Windows 下出现乱码\n",
    "with open(\"D:/desktop/movies.csv\", \"ab+\") as fp:\n",
    "    fp.write(codecs.BOM_UTF8)  \n",
    "\n",
    "# 保存数据到CSV文件\n",
    "def save_to_csv(data):\n",
    "    with open('D:/desktop/movies.csv', 'a', newline='', encoding='utf-8') as f:\n",
    "        writer = csv.writer(f)\n",
    "        writer.writerow(data)\n",
    "\n",
    "# 爬取电影网页并提取信息\n",
    "def scrape_movie_page(url):\n",
    "    response = requests.get(url)\n",
    "    if response.status_code == 200:\n",
    "        soup = BeautifulSoup(response.text, 'html.parser')\n",
    "        movies = soup.find_all(\"div\", class_=\"el-card item m-t is-hover-shadow\")\n",
    "        for movie in movies:\n",
    "            name = movie.find(\"h2\", class_=\"m-b-sm\").text.strip()\n",
    "            score = movie.find(\"p\", class_=\"score m-t-md m-b-n-sm\").text.strip()\n",
    "            categories = [button.text.strip() for button in movie.find_all(\"button\", class_=\"el-button category el-button--primary el-button--mini\")]\n",
    "            release_date = movie.find_all(\"div\", class_=\"m-v-sm info\")[-1].text.strip()\n",
    "            countries = [span.text.strip() for span in movie.find_all(\"span\", attrs={\"data-v-7f856186\": \"\"})]\n",
    "            duration = movie.find(\"span\", attrs={\"data-v-7f856186\": \"\"}).text.strip()\n",
    "            data = [name, score, categories, release_date, countries, duration]\n",
    "            save_to_csv(data)\n",
    "            # 下载图片\n",
    "            img_url = movie.find(\"img\")[\"src\"]\n",
    "            img_name = img_url.split(\"/\")[-1]\n",
    "            img_path = os.path.join(\"D:/desktop/movie_images\", img_name)\n",
    "            with open(img_path, 'wb') as img_file:\n",
    "                img_file.write(requests.get(img_url).content)\n",
    "    else:\n",
    "        print(\"请求失败，状态码:\", response.status_code)\n",
    "\n",
    "# 循环爬取所有网页\n",
    "def scrape_all_pages(base_url, num_pages):\n",
    "    for i in range(1, num_pages + 1):\n",
    "        url = f\"{base_url}/{i}\"\n",
    "        print(f\"正在爬取第 {i} 页...\")\n",
    "        scrape_movie_page(url)\n",
    "    print(\"所有网页已爬取完成！\")\n",
    "\n",
    "if __name__ == \"__main__\":\n",
    "    base_url = \"https://ssr1.scrape.center/page\"\n",
    "    num_pages = 10  # 假设总共有10页电影网页\n",
    "    scrape_all_pages(base_url, num_pages)\n",
    "\n",
    "\n"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "MyMLenvAllinOne",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.8"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
